home *** CD-ROM | disk | FTP | other *** search
/ isnet Internet / Isnet Internet CD.iso / prog / hiz / 09 / 09.exe / adynware.exe / perl / lib / adynware / web_site.pm < prev   
Encoding:
Perl POD Document  |  1999-12-28  |  9.9 KB  |  306 lines

  1. package web_site;
  2. use strict;
  3. #use diagnostics;
  4. use IO::Socket;
  5. use LWP::Simple;
  6. use LWP::UserAgent;
  7.  
  8. my %verifiedLinks = ();
  9.  
  10. sub Find
  11. {
  12.         my $self = shift;
  13.         my($target) = @_;
  14.         print "web_site::Find($target)\n";        
  15.                 
  16.         $self->Traverse($target, 
  17.                 
  18.         sub 
  19.         {
  20.                 my($name, $content) = @_; 
  21.                 #print "web_site::Find:$name\n";
  22.         });
  23. }
  24.  
  25. sub GetLinks
  26. {
  27.         my($base, $page) = @_;
  28.         #print "GetLinks($base)\n";
  29.         die "web_site::GetLinks cannot see protocolAndHost in $base" unless $base =~ m{([^/]*//[^/]*)/};
  30.         my $protocolAndHost = $1;
  31.         my @links = ();
  32.                                                                                         
  33.         my $tags = "(href|src|action)";
  34.         while ($page =~ m
  35.         {
  36.                 $tags\s*=\s*
  37.                 "([^"]+)"
  38.         }gisx)
  39.         {
  40.                 #print "match:3:$2\n";
  41.                 push(@links, web_site::makeAbsolute($2, $base, undef));
  42.         }
  43.                         
  44.         while ($page =~ m
  45.         {
  46.                 $tags\s*=\s*
  47.                 '([^']+)'
  48.         }gisx)
  49.         {
  50.                 #print "match:4:$2\n";
  51.                 push(@links, web_site::makeAbsolute($2, $base, undef));
  52.         }
  53.                         
  54.         while ($page =~ m
  55.         {
  56.                 $tags\s*=\s*
  57.                 ([^\s>'"]+)
  58.         }gisx)
  59.         {
  60.                 #print "match:5:$2\n";
  61.                 push(@links, web_site::makeAbsolute($2, $base, undef));
  62.         }                          
  63.         return @links;
  64. }
  65.  
  66.  
  67. sub InvalidLink
  68. {
  69.         my($from, $link, $statusCode, $message) = @_;
  70.         $from = "(null)" unless defined $from;
  71.         print "web_site::InvalidLink($from, $link, $statusCode, $message)\n";
  72. }
  73.  
  74. sub Traverse
  75. {
  76.         my $self = shift;
  77.         my($target, $function) = @_;
  78.         my @in = ($target);
  79.         my %out = ();
  80.         %verifiedLinks = ();
  81.                                                                 
  82.         while(scalar(@in))
  83.         {
  84.                 my $currentTarget = pop @in;
  85.                 next if defined $out{$currentTarget};
  86.                 $out{$currentTarget} = 1;
  87.                                                                                 
  88.                 my $response = $self->{"agent"}->request(new HTTP::Request("GET", $currentTarget));
  89.                 my $content = $response->content();
  90.                 die "could not retrieve $currentTarget" unless defined $content; 
  91.                                                                                 
  92.                 &$function($currentTarget, $content) if defined $function;
  93.                                                                                                                                 
  94.                 my $base = $response->base();
  95.                                                                 
  96.                 my @links = GetLinks($base, $content);
  97.                 foreach my $link (@links)
  98.                 {
  99.                         next if defined $out{$link} or defined $verifiedLinks{$link};
  100.                         my $content_type = $self->GetContentType($link, $currentTarget);
  101.                         if ($content_type and $content_type eq "text/html" and $link =~ /^$target/)
  102.                         {
  103.                                 #print "should pursue $link\n";
  104.                                 push(@in, $link);
  105.                         }
  106.                 }
  107.         }
  108. }
  109.  
  110. sub ValidLink
  111. {
  112.         my($from, $link) = @_;
  113.         $from = "(null)" unless defined $from;
  114.         $verifiedLinks{$link} = 1;
  115.         print "web_site::ValidLink($link)\n";
  116. }
  117.  
  118. sub GetContentType
  119. {
  120.         my $self = shift;
  121.         my($link, $from) = @_;
  122.         
  123.         my $response = $self->{"agent"}->request(new HTTP::Request("HEAD", $link));
  124.                 
  125.         if ($response->is_error())
  126.         {
  127.                 $response = $self->{"agent"}->request(new HTTP::Request("GET", $link));
  128.         }
  129.         if ($response->is_error())
  130.         {
  131.                 InvalidLink($from, $link, $response->code(), $response->message());
  132.                 return undef;
  133.         }
  134.         $verifiedLinks{$link} = 1;
  135.         print "web_site::GetContentType($link): ok\n";
  136.         ValidLink($from, $link);
  137.         return $response->content_type();
  138. }
  139.  
  140. sub CheckLink
  141. {
  142.         my($self, $link, $from) = @_;
  143.         return 1 if defined $verifiedLinks{$link};
  144.         return defined $self->GetContentType($link, $from); 
  145. }
  146.  
  147. sub Verify_list_of_links
  148. {
  149.         my($list) = @_;
  150.         my $agent = new LWP::UserAgent;
  151.         foreach my $link (@$list)
  152.         {
  153.                 CheckLink($agent, $link, undef);
  154.         }
  155. }
  156.  
  157. sub ResolveBase
  158. {
  159.         my($response) = @_;
  160.         return undef unless defined $response;
  161.         
  162.         # if the response is from a https query, then the request can be null.
  163.         # In this case, calling response->base() crashes perl.  So check:
  164.         return undef unless defined $response->request;
  165.         return ResolveBase2($response->content(), $response->base());
  166. }
  167.  
  168. sub ResolveBase2
  169. {
  170.         my($content, $headerBase) = @_;
  171.         return $headerBase unless $content =~ s/<\s*base\s*href\s*=\s*['"]?(.*?)['"]?\s*>//ims;
  172.         my $contentBase = $1;
  173.         return web_site::makeAbsolute($contentBase, $headerBase, undef);
  174. }
  175.  
  176.  
  177. sub CachedGet
  178. {
  179.         my($cacheDirectory, $target) = @_;
  180.                 
  181.         return undef unless $target =~ m{(.*)/([^/]*)$};
  182.         my $dirname = utility_file::flattenURL($1);
  183.         my $basename = $2;
  184.                         
  185.         $dirname = "$cacheDirectory/$dirname" if $cacheDirectory;
  186.                                 
  187.         $basename = "index.htm" unless $basename;
  188.                         
  189.         mkdir($dirname, 777) unless -d $dirname;
  190.         my $fileName = "$dirname/$basename";
  191.         #print "cached get: cache directory is $cacheDirectory; dir name is $dirname; base name is $basename\n"; exit();
  192.         
  193.         
  194.         if (-f "$fileName.raw")
  195.         {
  196.                 my $content = utility_file::getContent("$fileName.raw");
  197.                 die "could not extract base from $fileName.raw" unless $content =~ s/^base is (.*)//;
  198.                 my $base = $1;
  199.                 return ($content, $base, $fileName);
  200.         }
  201.         my($content, $base, $content_type) = Get($target);
  202.         utility_file::setContent("$fileName.raw", "base is $base\n$content");
  203.         return ($content, $base, $content_type, $fileName);
  204. }
  205. sub Get
  206. {
  207.         my($target) = @_;
  208.         my $agent = new LWP::UserAgent;
  209.         my $response = $agent->request(new HTTP::Request("GET", $target));
  210.         return undef unless defined $response and $response;
  211.                 
  212.         my $location = $response->header('Content-Location');
  213.         my $basename;
  214.         if ($location)
  215.         {
  216.                 $basename = utility_file::basename($location);
  217.         }
  218.         else
  219.         {
  220.                 $basename = utility_file::basename($target);
  221.                 $basename = "index.htm" unless $basename;
  222.         }
  223.         
  224.         my $content = $response->content();
  225.         
  226.         if (!$content)
  227.         {
  228.                 print "web_site.pm: Get($target) failure: ", $response->code(), ": ", $response->message(), "\n";
  229.         }
  230.         else
  231.         {
  232.                 print "web_site.pm: Get($target): ", $response->content_type(), " ", $response->code(), ": ", $response->message(), " to $basename\n";
  233.         }
  234.         return ($content, web_site::ResolveBase($response), $response->content_type(), $basename);
  235. }
  236.  
  237. sub new
  238. {
  239.         #web_site::Get("http://o266/r?pageid=idx-home&idx&comefrom=idx-ad&proxyad"); exit();
  240.  
  241.         my $class = shift;
  242.         my $self = {};
  243.         bless $self, $class;
  244.         $self->{"agent"} = new LWP::UserAgent;
  245.         return $self;
  246. }
  247.  
  248. sub makeAbsolute
  249. {
  250.         my($oldLink, $base, $current) = @_;
  251.         $base = "" unless defined $base;
  252.         $current = "" unless defined $current;
  253.                 
  254.         #print "web_site::makeAbsolute($oldLink, $base, $current)\n";
  255.         ##return $oldLink if !$base or $base eq "." or $oldLink =~ m{^[^/]+:};
  256.                         
  257.         my $link = $oldLink;
  258.                                 
  259.         # looksmart.com has URL references which begin with http:/whatever; should be interpreted as /whatever
  260.         $link =~ s{\w+:/([^/])}{$1};
  261.                                                 
  262.         if ($link =~ /^#/)
  263.         {
  264.                 $current =~ s/#.*//;
  265.                 $link = $current . $link;
  266.         }
  267.         #print "sending $link, $base\n";
  268.         my $absolute = new URI::URL($link, $base)->abs();    
  269.         
  270.         $absolute =~ s/&/&/g;
  271.         
  272.         $absolute =~ s{^([^:]+://[^/]+/)(\.\./)+}{$1}g;  # simply eliminate  leading "../" -- that's what navigator does
  273.         $absolute =~ s{/[^/]+/\.\./}{/}g;
  274.         
  275.         #print "makeAbsolute($oldLink, $base) yielded $absolute\n";
  276.         return $absolute;
  277. }
  278.  
  279.  
  280.  
  281. #my $w = web_site->new();
  282. #$w->Find("http://www.sterls.com/");
  283. #$w->Find("http://www.adyn.com");
  284. #$w->Find("http://www.adyn.com/k.html");
  285. #$w->Find("http://www.adyn.com/spinach/faq.html");
  286. #$w->Find("http://www.slip.net/");
  287. #$w->Find("http://home.netscape.com/");
  288.  
  289. #my $agent = new LWP::UserAgent;
  290. #my $request = new HTTP::Request("GET", "http://www.sterls.com/");
  291. #my $response = $agent->request($request);
  292. #my $content = $response->content();
  293. #my $header = $response->base();
  294. #
  295. #my $x = new URI::URL("../spinach/spinach.html", "http://www.adyn.com/etc/link.html");
  296. #print $x->abs()->as_string();
  297. #$x = new URI::URL("#whatever", "http://www.adyn.com/etc/link.html");
  298. #print $x->abs()->as_string();
  299.  
  300. my $x;
  301.  $x = makeAbsolute("http://www.m-w.com/cgi-bin/dictionary?book=Dictionary&va=bloviate", "http://www.m-w.com/cool/newwords", undef);
  302.  $x = makeAbsolute("../../index.html", "http://www.m-w.com/", undef);
  303.  $x = makeAbsolute("#offset", "http://www.m-w.com/", "http://www.adyn.com/index.html#initial");
  304.  $x = makeAbsolute("36.html", "http://www.geek-girl.com/emacs/faq/", "http://www.geek-girl.com/emacs/faq/index.html");
  305.  1;
  306.